Data: 1000 restaurants for each city
TODO
Notes
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import os
import scipy as sp
from scipy import stats
from tools.plt import color2d #from the 'srcole/tools' repo
from matplotlib import cm
In [2]:
# Load the per-city info table; the first CSV column is used as the index
cities_csv = '/gh/data2/yelp/city_pop.csv'
df_cities = pd.read_csv(cities_csv, index_col=0)
df_cities.head()
Out[2]:
In [3]:
# Load the restaurant table; the first CSV column is used as the index
restaurants_csv = '/gh/data2/yelp/food_by_city/df_restaurants.csv'
df_restaurants = pd.read_csv(restaurants_csv, index_col=0)
df_restaurants.head()
Out[3]:
In [9]:
# Load the restaurant-by-category table; the first CSV column is used as the index
categories_csv = '/gh/data2/yelp/food_by_city/df_categories.csv'
df_categories = pd.read_csv(categories_csv, index_col=0)
df_categories.head()
Out[9]:
In [10]:
# Manually merge categories with at least 500 counts.
# Each entry maps a category 'D' to the category 'V' that it should also be
# counted as.
category_subsets = {
    'delis': 'sandwiches',
    'sushi': 'japanese',
    'icecream': 'desserts',
    'cafes': 'coffee',
    'sportsbars': 'bars',
    'hotdog': 'hotdogs',
    'wine_bars': 'bars',
    'pubs': 'bars',
    'cocktailbars': 'bars',
    'beerbar': 'bars',
    'tacos': 'mexican',
    'gastropubs': 'bars',
    'ramen': 'japanese',
    'chocolate': 'desserts',
    'dimsum': 'chinese',
    'cantonese': 'chinese',
    'szechuan': 'chinese',
    'coffeeroasteries': 'coffee',
    'hookah_bars': 'bars',
    'irish_pubs': 'bars',
}

for narrow, broad in category_subsets.items():
    # A restaurant counts toward the broad category if it carries either label
    in_either = np.logical_or(df_categories[narrow], df_categories[broad])
    df_categories[broad] = in_either
In [11]:
# Remove some categories. Both listed here are keys of `category_subsets`
# ('hotdog' -> 'hotdogs', 'cafes' -> 'coffee'), so their restaurants are
# already counted under the broader label.
category_remove = ['hotdog', 'cafes']
# drop() returns a new frame, and errors='ignore' skips columns that are
# already gone, so re-running this cell does not raise a KeyError.
df_categories = df_categories.drop(category_remove, axis=1, errors='ignore')
In [12]:
# Rank categories by how many restaurants carry them and keep the N largest
N = 20
category_counts = df_categories.sum().sort_values(ascending=False)

top_counts = category_counts.head(N)
top_N_categories = top_counts.index.tolist()
top_N_categories_counts = top_counts.values

top_counts
Out[12]:
In [13]:
# Bar chart: fraction of all restaurants that fall in each of the top N categories
bar_positions = np.arange(N)
restaurant_fractions = top_N_categories_counts / len(df_restaurants)

plt.figure(figsize=(12,5))
plt.bar(bar_positions, restaurant_fractions, color='k', ecolor='.5')
plt.xlabel('Restaurant category', size=20)
plt.ylabel('Fraction of restaurants', size=20)
# One call sets tick positions, category labels, and their styling
plt.xticks(bar_positions, top_N_categories, size=15, rotation='vertical')
plt.yticks(size=15);
In [5]:
# Treat restaurants that share a name as locations of the same chain
gb = df_restaurants.groupby('name')

# Select the columns of interest *before* averaging. Calling gb.mean() on the
# whole frame averages every column first; on pandas >= 2.0 that raises a
# TypeError as soon as any remaining column is non-numeric.
chain_stat_columns = ['rating', 'review_count', 'cost']
df_chains = gb[chain_stat_columns].mean()

# Number of rows (locations) per name, then order chains from largest to smallest
df_chains['count'] = gb.size()
df_chains = df_chains.sort_values('count', ascending=False)
df_chains.head(10)
Out[5]:
In [6]:
# Only consider restaurants with at least 50 locations
min_count = 50
df_temp = df_chains[df_chains['count'] >= min_count]

# Columns drawn on a log axis, and the pyplot function to use for each
# (x is log, y is log) combination
log_columns = ['review_count', 'count']
plot_f_by_log_axes = {(True, True): plt.loglog,
                      (True, False): plt.semilogx,
                      (False, True): plt.semilogy,
                      (False, False): plt.plot}

plt.figure(figsize=(8,12))
plt_num = 1
column_names = df_temp.keys()
# One panel per unordered pair of columns; the title is the Spearman correlation
for i, k1 in enumerate(column_names):
    for k2 in column_names[i+1:]:
        plot_f = plot_f_by_log_axes[(k1 in log_columns, k2 in log_columns)]

        plt.subplot(3, 2, plt_num)
        plot_f(df_temp[k1], df_temp[k2], 'k.')
        plt.xlabel(k1)
        plt.ylabel(k2)
        plt_num += 1

        r, p = stats.spearmanr(df_temp[k1], df_temp[k2])
        plt.title(r)
plt.tight_layout()
In [28]:
from bokeh.io import output_notebook
from bokeh.layouts import row, widgetbox
from bokeh.models import CustomJS, Slider, Legend, HoverTool
from bokeh.plotting import figure, output_file, show, ColumnDataSource
output_notebook()

# Slider variables
min_N_franchises = 100

# Determine dataframe sources:
#   df_chains2 - every chain with more than 10 locations (full pool for the slider)
#   df_temp    - the subset initially shown (at least `min_N_franchises` locations)
df_chains2 = df_chains[df_chains['count'] > 10].reset_index()
df_temp = df_chains2[df_chains2['count'] >= min_N_franchises]

# Create data sources for plotting (source1) and for the Slider callback (source2)
source1 = ColumnDataSource(df_temp, id='source1')
source2 = ColumnDataSource(df_chains2, id='source2')

hover = HoverTool(tooltips=[
    ("Name", "@name"),
    ("Avg Stars", "@rating"),
    ("# locations", "@count")])

# Make initial figure of average rating vs number of locations
plot = figure(plot_width=400, plot_height=400,
              x_axis_label='Number of locations',
              y_axis_label='Average rating',
              x_axis_type="log", tools=[hover])
plot.scatter('count', 'rating', source=source1, line_width=3, line_alpha=0.6, line_color='black')

# Declare how to update plot on slider change.
# Every column of the plotted source is rebuilt from the full source so that
# all columns stay the same length and hover names line up with the points.
# (Previously "name" was never reset, so names accumulated across slider moves,
# and the loop ran one index past the end of the data.)
callback = CustomJS(args=dict(s1=source1, s2=source2), code="""
    var d1 = s1.get("data");
    var d2 = s2.get("data");
    var threshold = N.value;
    var keys = Object.keys(d2);
    var i, j;
    for (j = 0; j < keys.length; j++) {
        d1[keys[j]] = [];
    }
    for (i = 0; i < d2["count"].length; i++) {
        if (d2["count"][i] >= threshold) {
            for (j = 0; j < keys.length; j++) {
                d1[keys[j]].push(d2[keys[j]][i]);
            }
        }
    }
    s1.change.emit();
""")

N_slider = Slider(start=10, end=1000, value=min_N_franchises, step=10,
                  title="minimum number of franchises", callback=callback)
# The slider is created after the callback, so it is attached to the callback's
# args here; the JS code reads it as `N`.
callback.args["N"] = N_slider

# Define layout of plot and sliders
layout = row(plot, widgetbox(N_slider))

# Output and show
output_file("/gh/srcole.github.io/assets/misc/yelp_bokeh.html", title="Yelp WIP")
show(layout)
In [13]:
# Bin edges for each restaurant attribute; review counts use log-spaced bins
# with N_bins_per_factor10 bins per decade between 10^1 and 10^5
N_bins_per_factor10 = 8
bins_by_key = {'rating': np.arange(0.75, 5.75, .5),
               'review_count': np.logspace(1, 5, num=N_bins_per_factor10*4+1),
               'cost': np.arange(.5, 5, 1)}
# Whether the histogram for each attribute is drawn with log=True
log_by_key = {'rating': False,
              'review_count': True,
              'cost': False}

plt.figure(figsize=(12, 4))
for i, (k, bin_edges) in enumerate(bins_by_key.items()):
    values = df_restaurants[k].values
    # Equal weight of 1/len per restaurant, so bar heights are fractions of the data
    weights = np.ones_like(values) / float(len(values))

    plt.subplot(1, 3, i+1)
    plt.hist(values, bin_edges, log=log_by_key[k],
             color='k', edgecolor='.5', weights=weights)
    if k == 'review_count':
        # Plotting a single point with semilogx switches the x axis to log scale
        plt.semilogx(1,1)
        plt.xlim((10, 40000))
    elif i == 0:
        plt.ylabel('Probability')
    plt.xlabel(k)
plt.tight_layout()
In [14]:
def _normalized_hist_and_points(values, bin_edges):
    """Histogram `values` into `bin_edges` and summarize them.

    Returns
    -------
    hist : array
        Count in each bin divided by the total count over all bins, so the
        row sums to 1.
    points : list
        [mean, std, min(std, 5 - mean)] of `values`.
    """
    counts, _ = np.histogram(values, bins=bin_edges)
    mean, std = np.mean(values), np.std(values)
    # NOTE(review): the `5 - mean` cap is applied to every variable, including
    # review counts, where it is typically negative. It looks like it was
    # written for ratings only - confirm before using column 2 of the
    # review-count summaries.
    return counts / np.sum(counts), [mean, std, np.min([std, 5 - mean])]


# Prepare histogram analysis: row labels of the restaurants at each cost / rating
gb_cost = df_restaurants.groupby('cost').groups
gb_rating = df_restaurants.groupby('rating').groups
# Remove 0 from gb_rating; the default avoids a KeyError when no restaurant
# has a rating of 0
gb_rating.pop(0.0, None)

N_bins_cost = len(gb_cost.keys())
N_bins_count = len(bins_by_key['review_count']) - 1
N_bins_rate = len(bins_by_key['rating']) - 1

# Hist: review count and rating as fn of cost
hist_count_by_cost = np.zeros((N_bins_cost, N_bins_count))
hist_rate_by_cost = np.zeros((N_bins_cost, N_bins_rate))
points_count_by_cost = np.zeros((N_bins_cost, 3))
points_rate_by_cost = np.zeros((N_bins_cost, 3))
for i, k in enumerate(gb_cost.keys()):
    rows = gb_cost[k]
    # Distribution of review count within this cost level (row sums to 1)
    hist_count_by_cost[i], points_count_by_cost[i] = _normalized_hist_and_points(
        df_restaurants.loc[rows, 'review_count'].values, bins_by_key['review_count'])
    # Repeat for rating
    hist_rate_by_cost[i], points_rate_by_cost[i] = _normalized_hist_and_points(
        df_restaurants.loc[rows, 'rating'].values, bins_by_key['rating'])

# Make histograms of review count as fn of rating
hist_count_by_rate = np.zeros((N_bins_rate, N_bins_count))
points_count_by_rate = np.zeros((N_bins_rate, 3))
for i, k in enumerate(gb_rating.keys()):
    # Distribution of review count within this rating level (row sums to 1)
    hist_count_by_rate[i], points_count_by_rate[i] = _normalized_hist_and_points(
        df_restaurants.loc[gb_rating[k], 'review_count'].values, bins_by_key['review_count'])
In [15]:
# 2d colorplot: distribution of rating (x) at each cost level (y)
rate_by_cost_options = dict(
    cmap=cm.viridis,
    clim=[0,.4], cticks=np.arange(0,.41,.05), color_label='Probability',
    plot_xlabel='Rating', plot_ylabel='Cost ($)',
    plot_xticks_locs=range(N_bins_rate), plot_xticks_labels=gb_rating.keys(),
    plot_yticks_locs=range(N_bins_cost), plot_yticks_labels=gb_cost.keys(),
    interpolation='none', fontsize_minor=14, fontsize_major=19)

plt.figure(figsize=(10,4))
color2d(hist_rate_by_cost, **rate_by_cost_options)

# On top, plot the mean and st. dev. (disabled; the expression below is incomplete)
# plt.errorbar(points_rate_by_cost[:,0] / , np.arange(N_bins_cost), fmt='.', color='w', ms=10,
# xerr=points_rate_by_cost[:,1:].T, ecolor='w', alpha=.5)
In [16]:
# 2d colorplot: distribution of review count (x) at each cost level (y)
# Label the x axis at every N_bins_per_factor10-th bin edge over the first two decades
xbins_label = np.arange(0,N_bins_per_factor10*2+1, N_bins_per_factor10)
review_count_tick_labels = bins_by_key['review_count'][xbins_label].astype(int)

count_by_cost_options = dict(
    cmap=cm.viridis,
    clim=[0,.2], cticks=np.arange(0,.21,.05), color_label='Probability',
    plot_xlabel='Number of reviews', plot_ylabel='Cost ($)',
    plot_xticks_locs=xbins_label, plot_xticks_labels=review_count_tick_labels,
    plot_yticks_locs=range(N_bins_cost), plot_yticks_labels=gb_cost.keys(),
    interpolation='none', fontsize_minor=14, fontsize_major=19)

plt.figure(figsize=(10,4))
color2d(hist_count_by_cost, **count_by_cost_options)
# Show only the bins covering the labelled tick range
plt.xlim((-.5,N_bins_per_factor10*2 + .5))
Out[16]:
In [17]:
# 2d colorplot: distribution of review count (x) at each rating level (y)
# Label the x axis at every N_bins_per_factor10-th bin edge over the first two decades
xbins_label = np.arange(0,N_bins_per_factor10*2+1, N_bins_per_factor10)
review_count_tick_labels = bins_by_key['review_count'][xbins_label].astype(int)

count_by_rate_options = dict(
    cmap=cm.viridis,
    clim=[0,.4], cticks=np.arange(0,.41,.1), color_label='Probability',
    plot_xlabel='Number of reviews', plot_ylabel='Rating',
    plot_xticks_locs=xbins_label, plot_xticks_labels=review_count_tick_labels,
    plot_yticks_locs=range(N_bins_rate), plot_yticks_labels=gb_rating.keys(),
    interpolation='none', fontsize_minor=14, fontsize_major=19)

plt.figure(figsize=(10,6))
color2d(hist_count_by_rate, **count_by_rate_options)
# Show only the bins covering the labelled tick range
plt.xlim((-.5,N_bins_per_factor10*2 + .5))
Out[17]: